Intro

In this script, I take the collated stomach data set and calculate aggregates (feeding ratio, total weight of prey groups) and predictor variables for diet data, aggregate to get 1 stomach = 1 row per prey type (not prey individual). I also select only the columns I need for model fitting, join environmental covariates and cpue covariates for cod and flounder, and lastly saduria biomass densities.

Load packages & source functions

# Load libraries, install if needed
library(tidyverse)
#> Warning: package 'tidyr' was built under R version 4.0.5
library(readxl)
library(tidylog)
library(RCurl)
library(RColorBrewer)
#> Warning: package 'RColorBrewer' was built under R version 4.0.5
library(patchwork)
library(janitor)
library(forcats)
library(gapminder)
library(viridis)
library(ggridges)
library(raster)
library(icesDatras)
library(ggalluvial)
library(ggrepel)
library(ncdf4)
library(chron)
library(rnaturalearth)
library(rnaturalearthdata)
library(mapplots)
library(geosphere)
library(quantreg)
#> Warning in .recacheSubclasses(def@className, def, env): undefined subclass
#> "numericVector" of class "Mnumeric"; definition not updated
library(brms)
#> Warning: package 'Rcpp' was built under R version 4.0.5
library(sdmTMB)
library(sp)
library(raster)
library(modelr)
options(mc.cores = parallel::detectCores()) 

# Source code for map plots
source("/Users/maxlindmark/Dropbox/Max work/R/cod_interactions/R/functions/map_plot.R")

# Load cache
# qwraps2::lazyload_cache_dir(path = "R/prepare_data/04_density_models_for_covars_cache/html")

theme_set(theme_plot())

# Continuous colors
options(ggplot2.continuous.colour = "viridis")

# Discrete colors
scale_colour_discrete <- function(...) {
  scale_colour_brewer(palette = "Dark2")
}

scale_fill_discrete <- function(...) {
  scale_fill_brewer(palette = "Dark2")
}

Read stomach data

dat <- read_csv("data/clean/clean_stomach_data.csv")
#> Rows: 5885 Columns: 29
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr  (7): pred_id, predator_latin_name, species, ices_rect, haul_id, pred_we...
#> dbl (22): pred_weight_g, pred_length_cm, year, quarter, month, day, subdiv, ...
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Which body sizes to use as density covariates?

We know from the data exploration that cod in contrast to flounder has a clear ontogenetic diet shift towards pelagic prey,

dat_logistic <- dat %>%
  filter(tot_feeding_ratio > 0 & species == "Cod") %>% 
  mutate(fish_presence = ifelse(pelagic_feeding_ratio == 0, 0, 1))
#> filter: removed 3,219 rows (55%), 2,666 rows remaining
#> mutate: new variable 'fish_presence' (double) with 2 unique values and 0% NA

ggplot(dat_logistic, aes(pred_length_cm, factor(fish_presence))) + 
  geom_jitter(alpha = 0.5)


glm1 <- glm(fish_presence ~ pred_length_cm,
            data = dat_logistic,
            family = binomial)

coef(glm1)
#>    (Intercept) pred_length_cm 
#>     -4.6362921      0.1211906

a <- coef(glm1)[1]
b <- coef(glm1)[2]

l50 <- -a/b

l50
#> (Intercept) 
#>    38.25622

lrPerc <- function(cf,p) (log(p/(1-p))-cf[1])/cf[2]

l20 <- lrPerc(coef(glm1), 0.20)

l20
#> (Intercept) 
#>    26.81725

nd <- data.frame(pred_length_cm = seq_range(dat_logistic$pred_length_cm, n = 100))

nd$pred <- predict(glm1, newdata = nd, type = "response")

# Add glm prediction to plot
p1 <- ggplot(dat_logistic, aes(pred_length_cm, fish_presence)) + 
  geom_jitter(alpha = 0.3, shape = 21, size = 0.7, fill = "white", stroke = 1.1, width = 0, height = 0.4) +
  geom_line(data = nd, aes(pred_length_cm, pred), color = "tomato", size = 1.2) +
  geom_segment(x = l50, y = -10, xend = l50, yend = 0.5, color = "tomato", size = 1, aes(linetype = "L50")) +
  geom_segment(x = l20, y = -10, xend = l20, yend = 0.2, color = "tomato", size = 1, aes(linetype = "L20")) +
  labs(y = "Fish presence", x = "Predator length (cm)", linetype = "") + 
  coord_cartesian(expand = 0) +
  scale_y_continuous(breaks = c(0, 1)) +
  scale_x_continuous(breaks = seq(0, 80, 5), lim = c(0, 85)) +
  theme_plot()

#ggsave("figures/supp/glm.pdf", width = 17, height = 17, units = "cm")

# Filter cod based on what prey they have in stomachs
cod_pelagic <- dat %>%
  filter(species == "Cod" & pelagic_feeding_ratio/tot_feeding_ratio > 0.5)
#> filter: removed 5,021 rows (85%), 864 rows remaining

cod_pelagic2 <- dat %>%
  filter(species == "Cod" & pelagic_feeding_ratio == tot_feeding_ratio)
#> filter: removed 4,745 rows (81%), 1,140 rows remaining

m1 <- rq(pred_length_cm ~ 1, data = cod_pelagic, tau = 0.05)
summary(m1)
#> 
#> Call: rq(formula = pred_length_cm ~ 1, tau = 0.05, data = cod_pelagic)
#> 
#> tau: [1] 0.05
#> 
#> Coefficients:
#> (Intercept) 
#>          26

intercept <- as.vector(m1$coefficients["(Intercept)"])

p2 <- ggplot() +
  geom_histogram(data = dat %>% filter(species == "Cod"), aes(pred_length_cm, fill = "All cod"), alpha = 0.5) +
  geom_histogram(data = cod_pelagic, aes(pred_length_cm, fill = "Majority pelagic biomass"), alpha = 0.5) +
  geom_vline(xintercept = intercept, linetype = 2, color = brewer.pal("Dark2", n = 3)[2], size = 1.2) +
  coord_cartesian(expand = 0) +
  scale_x_continuous(breaks = seq(0, 80, 5), lim = c(0, 85)) +
  labs(fill = "", x = "Predator length (cm)", y = "Count")
#> filter: removed 2,579 rows (44%), 3,306 rows remaining

p1 / p2
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> Warning: Removed 2 rows containing missing values (geom_bar).
#> Removed 2 rows containing missing values (geom_bar).

ggsave("figures/supp/l10_quant_sizedist.pdf", width = 17, height = 17, units = "cm")
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> Warning: Removed 2 rows containing missing values (geom_bar).
#> Removed 2 rows containing missing values (geom_bar).

# fit2 <- brm(bf(predator_length_cm ~ 1, quantile = 0.1), data = cod_pelagic, family = asym_laplace())
# summary(fit2)
# plot(fit2)

Add environmental covariates (oxygen, temperature)

Oxygen

# Downloaded from here: https://resources.marine.copernicus.eu/?option=com_csw&view=details&product_id=BALTICSEA_REANALYSIS_BIO_003_012
# Extract raster points: https://gisday.wordpress.com/2014/03/24/extract-raster-values-from-points-using-r/comment-page-1/
# https://rpubs.com/boyerag/297592
# https://pjbartlein.github.io/REarthSysSci/netCDF.html#get-a-variable
# Open the netCDF file
ncin <- nc_open("data/NEMO_Nordic_SCOBI/dataset-reanalysis-scobi-monthlymeans_1664182224542.nc")

print(ncin)
#> File data/NEMO_Nordic_SCOBI/dataset-reanalysis-scobi-monthlymeans_1664182224542.nc (NC_FORMAT_CLASSIC):
#> 
#>      1 variables (excluding dimension variables):
#>         float o2b[longitude,latitude,time]   
#>             long_name: Sea_floor_Dissolved_Oxygen_Concentration
#>             missing_value: NaN
#>             standard_name: mole_concentration_of_dissolved_molecular_oxygen_in_sea_water
#>             units: mmol m-3
#>             _FillValue: NaN
#>             _ChunkSizes: 1
#>              _ChunkSizes: 523
#>              _ChunkSizes: 383
#> 
#>      3 dimensions:
#>         time  Size:336
#>             axis: T
#>             long_name: Validity time
#>             standard_name: time
#>             units: days since 1950-01-01 00:00:00
#>             calendar: gregorian
#>             _ChunkSizes: 512
#>             _CoordinateAxisType: Time
#>             valid_min: 15721.5
#>             valid_max: 25917.5
#>         latitude  Size:523
#>             axis: Y
#>             standard_name: latitude
#>             long_name: latitude
#>             units: degrees_north
#>             _CoordinateAxisType: Lat
#>             valid_min: 48.49169921875
#>             valid_max: 65.8914184570312
#>         longitude  Size:383
#>             standard_name: longitude
#>             long_name: longitude
#>             units: degrees_east
#>             axis: X
#>             _CoordinateAxisType: Lon
#>             valid_min: 9.01375484466553
#>             valid_max: 30.2357654571533
#> 
#>     24 global attributes:
#>         references: http://www.smhi.se
#>         institution: Swedish Meterological and Hydrological Institute
#>         history: See source and creation_date attributees
#>         Conventions: CF-1.5
#>         contact: servicedesk_cmems@mercator-ocean.eu
#>         comment: Provided by SMHI as a Copernicus Marine Environment Monitoring Service production unit
#>         bullentin_type: reanalysis
#>         cmems_product_id: BALTICSEA_REANALYSIS_BIO_003_012
#>         title: CMEMS V4 Reanalysis: SCOBI model 3D fields (monthly means)
#>         FROM_ORIGINAL_FILE__easternmost_longitude: 30.2357654571533
#>         FROM_ORIGINAL_FILE__northernmost_latitude: 65.8914184570312
#>         FROM_ORIGINAL_FILE__westernmost_longitude: 9.01375484466553
#>         FROM_ORIGINAL_FILE__southernmost_latitude: 48.49169921875
#>         shallowest_depth: 1.50136542320251
#>         deepest_depth: 711.059204101562
#>         source: SMHI reanalysis run NORDIC-NS2_1d_20201201_20201201
#>         file_quality_index: 1
#>         creation_date: 2021-11-09 UTC
#>         bullentin_date: 20201201
#>         start_date: 2020-12-01 UTC
#>         stop_date: 2020-12-01 UTC
#>         start_time: 00:00 UTC
#>         stop_time: 00:00 UTC
#>         _CoordSysBuilder: ucar.nc2.dataset.conv.CF1Convention

# Get longitude and latitude
lon <- ncvar_get(ncin,"longitude")
nlon <- dim(lon)
head(lon)
#> [1] 9.013755 9.069310 9.124865 9.180420 9.235975 9.291530

lat <- ncvar_get(ncin,"latitude")
nlat <- dim(lat)
head(lat)
#> [1] 48.49170 48.52503 48.55836 48.59170 48.62503 48.65836

# Get time
time <- ncvar_get(ncin,"time")
time
#>   [1] 15721.5 15751.0 15780.5 15811.0 15841.5 15872.0 15902.5 15933.5 15964.0
#>  [10] 15994.5 16025.0 16055.5 16086.5 16116.0 16145.5 16176.0 16206.5 16237.0
#>  [19] 16267.5 16298.5 16329.0 16359.5 16390.0 16420.5 16451.5 16481.0 16510.5
#>  [28] 16541.0 16571.5 16602.0 16632.5 16663.5 16694.0 16724.5 16755.0 16785.5
#>  [37] 16816.5 16846.5 16876.5 16907.0 16937.5 16968.0 16998.5 17029.5 17060.0
#>  [46] 17090.5 17121.0 17151.5 17182.5 17212.0 17241.5 17272.0 17302.5 17333.0
#>  [55] 17363.5 17394.5 17425.0 17455.5 17486.0 17516.5 17547.5 17577.0 17606.5
#>  [64] 17637.0 17667.5 17698.0 17728.5 17759.5 17790.0 17820.5 17851.0 17881.5
#>  [73] 17912.5 17942.0 17971.5 18002.0 18032.5 18063.0 18093.5 18124.5 18155.0
#>  [82] 18185.5 18216.0 18246.5 18277.5 18307.5 18337.5 18368.0 18398.5 18429.0
#>  [91] 18459.5 18490.5 18521.0 18551.5 18582.0 18612.5 18643.5 18673.0 18702.5
#> [100] 18733.0 18763.5 18794.0 18824.5 18855.5 18886.0 18916.5 18947.0 18977.5
#> [109] 19008.5 19038.0 19067.5 19098.0 19128.5 19159.0 19189.5 19220.5 19251.0
#> [118] 19281.5 19312.0 19342.5 19373.5 19403.0 19432.5 19463.0 19493.5 19524.0
#> [127] 19554.5 19585.5 19616.0 19646.5 19677.0 19707.5 19738.5 19768.5 19798.5
#> [136] 19829.0 19859.5 19890.0 19920.5 19951.5 19982.0 20012.5 20043.0 20073.5
#> [145] 20104.5 20134.0 20163.5 20194.0 20224.5 20255.0 20285.5 20316.5 20347.0
#> [154] 20377.5 20408.0 20438.5 20469.5 20499.0 20528.5 20559.0 20589.5 20620.0
#> [163] 20650.5 20681.5 20712.0 20742.5 20773.0 20803.5 20834.5 20864.0 20893.5
#> [172] 20924.0 20954.5 20985.0 21015.5 21046.5 21077.0 21107.5 21138.0 21168.5
#> [181] 21199.5 21229.5 21259.5 21290.0 21320.5 21351.0 21381.5 21412.5 21443.0
#> [190] 21473.5 21504.0 21534.5 21565.5 21595.0 21624.5 21655.0 21685.5 21716.0
#> [199] 21746.5 21777.5 21808.0 21838.5 21869.0 21899.5 21930.5 21960.0 21989.5
#> [208] 22020.0 22050.5 22081.0 22111.5 22142.5 22173.0 22203.5 22234.0 22264.5
#> [217] 22295.5 22325.0 22354.5 22385.0 22415.5 22446.0 22476.5 22507.5 22538.0
#> [226] 22568.5 22599.0 22629.5 22660.5 22690.5 22720.5 22751.0 22781.5 22812.0
#> [235] 22842.5 22873.5 22904.0 22934.5 22965.0 22995.5 23026.5 23056.0 23085.5
#> [244] 23116.0 23146.5 23177.0 23207.5 23238.5 23269.0 23299.5 23330.0 23360.5
#> [253] 23391.5 23421.0 23450.5 23481.0 23511.5 23542.0 23572.5 23603.5 23634.0
#> [262] 23664.5 23695.0 23725.5 23756.5 23786.0 23815.5 23846.0 23876.5 23907.0
#> [271] 23937.5 23968.5 23999.0 24029.5 24060.0 24090.5 24121.5 24151.5 24181.5
#> [280] 24212.0 24242.5 24273.0 24303.5 24334.5 24365.0 24395.5 24426.0 24456.5
#> [289] 24487.5 24517.0 24546.5 24577.0 24607.5 24638.0 24668.5 24699.5 24730.0
#> [298] 24760.5 24791.0 24821.5 24852.5 24882.0 24911.5 24942.0 24972.5 25003.0
#> [307] 25033.5 25064.5 25095.0 25125.5 25156.0 25186.5 25217.5 25247.0 25276.5
#> [316] 25307.0 25337.5 25368.0 25398.5 25429.5 25460.0 25490.5 25521.0 25551.5
#> [325] 25582.5 25612.5 25642.5 25673.0 25703.5 25734.0 25764.5 25795.5 25826.0
#> [334] 25856.5 25887.0 25917.5

tunits <- ncatt_get(ncin,"time","units")
nt <- dim(time)
nt
#> [1] 336
tunits
#> $hasatt
#> [1] TRUE
#> 
#> $value
#> [1] "days since 1950-01-01 00:00:00"

# Get oxygen
dname <- "o2b"

oxy_array <- ncvar_get(ncin,dname)
dlname <- ncatt_get(ncin,dname,"long_name")
dunits <- ncatt_get(ncin,dname,"units")
fillvalue <- ncatt_get(ncin,dname,"_FillValue")
dim(oxy_array)
#> [1] 383 523 336

# Get global attributes
title <- ncatt_get(ncin,0,"title")
institution <- ncatt_get(ncin,0,"institution")
datasource <- ncatt_get(ncin,0,"source")
references <- ncatt_get(ncin,0,"references")
history <- ncatt_get(ncin,0,"history")
Conventions <- ncatt_get(ncin,0,"Conventions")

# Convert time: split the time units string into fields
tustr <- strsplit(tunits$value, " ")
tdstr <- strsplit(unlist(tustr)[3], "-")
tmonth <- as.integer(unlist(tdstr)[2])
tday <- as.integer(unlist(tdstr)[3])
tyear <- as.integer(unlist(tdstr)[1])

# Here I deviate from the guide a little bit. Save this info:
dates <- chron(time, origin = c(tmonth, tday, tyear))

# Crop the date variable
months <- as.numeric(substr(dates, 2, 3))
years <- as.numeric(substr(dates, 8, 9))
years <- ifelse(years > 90, 1900 + years, 2000 + years)

# Replace netCDF fill values with NA's
oxy_array[oxy_array == fillvalue$value] <- NA

# Next, we need to work with the months that correspond to the quarters that we use.
# loop through each time step, and if it is a good month save it as a raster.
# First get the index of months that correspond to Q4
months
#>   [1]  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1
#>  [26]  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2
#>  [51]  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3
#>  [76]  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4
#> [101]  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5
#> [126]  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6
#> [151]  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7
#> [176]  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8
#> [201]  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9
#> [226] 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10
#> [251] 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11
#> [276] 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12
#> [301]  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1
#> [326]  2  3  4  5  6  7  8  9 10 11 12

index_keep_q1 <- which(months < 4)
index_keep_q4 <- which(months > 9)

oxy_q1 <- oxy_array[, , index_keep_q1]
oxy_q4 <- oxy_array[, , index_keep_q4]

months_keep_q1 <- months[index_keep_q1]
months_keep_q4 <- months[index_keep_q4]

years_keep_q1 <- years[index_keep_q1]
years_keep_q4 <- years[index_keep_q4]

# Now we have an array with data for that quarter
# We need to now calculate the average within a year.
# Get a sequence that takes every third value between 1: number of months (length)
loop_seq_q1 <- seq(1, dim(oxy_q1)[3], by = 3)
loop_seq_q4 <- seq(1, dim(oxy_q4)[3], by = 3)

# Create objects that will hold data
dlist_q1 <- list()
dlist_q4 <- list()

oxy_1 <- c()
oxy_2 <- c()
oxy_3 <- c()
oxy_ave_q1 <- c()

oxy_10 <- c()
oxy_11 <- c()
oxy_12 <- c()
oxy_ave_q4 <- c()

# Now average by quarter. The vector loop_seq_q1 is 1, 4, 7 etc. So first i is 1, 2, 3,
# which is the index we want. 

for(i in loop_seq_q1) { # We can use q1 as looping index, doesn't matter!
  
  oxy_1 <- oxy_q1[, , (i)]
  oxy_2 <- oxy_q1[, , (i + 1)]
  oxy_3 <- oxy_q1[, , (i + 2)]
  
  oxy_10 <- oxy_q4[, , (i)]
  oxy_11 <- oxy_q4[, , (i + 1)]
  oxy_12 <- oxy_q4[, , (i + 2)]
  
  oxy_ave_q1 <- (oxy_1 + oxy_2 + oxy_3) / 3
  oxy_ave_q4 <- (oxy_10 + oxy_11 + oxy_12) / 3
    
  list_pos_q1 <- ((i/3) - (1/3)) + 1 # to get index 1:n(years)
  list_pos_q4 <- ((i/3) - (1/3)) + 1 # to get index 1:n(years)
  
  dlist_q1[[list_pos_q1]] <- oxy_ave_q1
  dlist_q4[[list_pos_q4]] <- oxy_ave_q4

}

# Now name the lists with the year:
names(dlist_q1) <- unique(years_keep_q1)
names(dlist_q4) <- unique(years_keep_q4)

# Now I need to make a loop where I extract the raster value for each year...
# The cpue data is called dat so far in this script

# Filter years in the cpue data frame to only have the years I have oxygen for
d_sub_oxy_q1 <- dat %>% filter(quarter == 1) %>% filter(year %in% names(dlist_q1)) %>% droplevels()
#> filter: removed 2,028 rows (34%), 3,857 rows remaining
#> filter: no rows removed
d_sub_oxy_q4 <- dat %>% filter(quarter == 4) %>% filter(year %in% names(dlist_q4)) %>% droplevels()
#> filter: removed 3,857 rows (66%), 2,028 rows remaining
#> filter: no rows removed

# Create data holding object
oxy_data_list_q1 <- list()
oxy_data_list_q4 <- list()

# ... And for the oxygen raster
raster_list_q1 <- list()
raster_list_q4 <- list()

# Create factor year for indexing the list in the loop
d_sub_oxy_q1$year_f <- as.factor(d_sub_oxy_q1$year)
d_sub_oxy_q4$year_f <- as.factor(d_sub_oxy_q4$year)

sort(unique(d_sub_oxy_q1$year_f))
#> [1] 2016 2017 2018 2020
#> Levels: 2016 2017 2018 2020
sort(unique(d_sub_oxy_q4$year_f))
#> [1] 2015 2016 2017 2018 2019
#> Levels: 2015 2016 2017 2018 2019

# We have missing years. Add fake data to not break the loop
d_sub_oxy_q1_extra <- d_sub_oxy_q4 %>% dplyr::select(year, lat, lon) %>% head(2) %>% mutate(year = c(2015, 2019))
#> mutate: changed one value (50%) of 'year' (0 new NA)
d_sub_oxy_q1 <- bind_rows(d_sub_oxy_q1, d_sub_oxy_q1_extra) %>% 
  mutate(year_f = as.factor(year))
#> mutate: changed 2 values (<1%) of 'year_f' (2 fewer NA)

d_sub_oxy_q4_extra <- d_sub_oxy_q4 %>% dplyr::select(year, lat, lon) %>% head(1) %>% mutate(year = 2020)
#> mutate: changed one value (100%) of 'year' (0 new NA)
d_sub_oxy_q4 <- bind_rows(d_sub_oxy_q4, d_sub_oxy_q4_extra) %>% 
  mutate(year_f = as.factor(year))
#> mutate: changed one value (<1%) of 'year_f' (1 fewer NA)

# Loop through each year and extract raster values for the data points
for(i in as.factor(c(2015:2020))) {
  
  # Set plot limits
  ymin = 54; ymax = 58; xmin = 12; xmax = 22

  # Subset a year
  oxy_slice_q1 <- dlist_q1[[i]]
  oxy_slice_q4 <- dlist_q4[[i]]
  
  # Create raster for that year (i)
  r_q1 <- raster(t(oxy_slice_q1), xmn = min(lon), xmx = max(lon), ymn = min(lat), ymx = max(lat),
                 crs = CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs+ towgs84=0,0,0"))
  r_q4 <- raster(t(oxy_slice_q4), xmn = min(lon), xmx = max(lon), ymn = min(lat), ymx = max(lat),
                 crs = CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs+ towgs84=0,0,0"))
  
  # Flip...
  r_q1 <- flip(r_q1, direction = 'y')
  r_q4 <- flip(r_q4, direction = 'y')
  
  plot(r_q1, main = paste(i, "Q1"))
  plot(r_q4, main = paste(i, "Q4"))
  
  # Filter the same year (i) in the data and select only coordinates
  d_slice_q1 <- d_sub_oxy_q1 %>% filter(year_f == i) %>% dplyr::select(lon, lat)
  d_slice_q4 <- d_sub_oxy_q4 %>% filter(year_f == i) %>% dplyr::select(lon, lat)
  
  # Make into a SpatialPoints object
  data_sp_q1 <- SpatialPoints(d_slice_q1)
  data_sp_q4 <- SpatialPoints(d_slice_q4)
  
  # Extract raster value (oxygen)
  rasValue_q1 <- raster::extract(r_q1, data_sp_q1)
  rasValue_q4 <- raster::extract(r_q4, data_sp_q4)
  
  # Now we want to plot the results of the raster extractions by plotting the cpue
  # data points over a raster and saving it for each year.
  # Make the SpatialPoints object into a raster again (for plot)
  df_q1 <- as.data.frame(data_sp_q1)
  df_q4 <- as.data.frame(data_sp_q4)
  
  # Add in the raster value in the df holding the coordinates for the cpue data
  d_slice_q1$oxy <- rasValue_q1
  d_slice_q4$oxy <- rasValue_q4
  
  # Add in which year
  d_slice_q1$year <- i
  d_slice_q4$year <- i

  # Now the unit of oxygen is mmol/m3. I want it to be ml/L. The original model is in unit ml/L
  # and it's been converted by the data host. Since it was converted without accounting for
  # pressure or temperature, I can simply use the following conversion factor:
  # 1 ml/l = 103/22.391 = 44.661 μmol/l -> 1 ml/l = 0.044661 mmol/l = 44.661 mmol/m^3 -> 0.0223909 ml/l = 1mmol/m^3
  # https://ocean.ices.dk/tools/unitconversion.aspx

  d_slice_q1$oxy <- d_slice_q1$oxy * 0.0223909
  d_slice_q4$oxy <- d_slice_q4$oxy * 0.0223909
    
  # Create a index for the data last where we store all years (because our loop index
  # i is not continuous, we can't use it directly)
  index_q1 <- as.numeric(as.character(d_slice_q1$year))[1] - 1992
  index_q4 <- as.numeric(as.character(d_slice_q4$year))[1] - 1992
  
  # Add each years' data in the list
  oxy_data_list_q1[[index_q1]] <- d_slice_q1
  oxy_data_list_q4[[index_q4]] <- d_slice_q4
  
}

#> filter: removed 3,858 rows (>99%), one row remaining
#> filter: removed 1,509 rows (74%), 520 rows remaining

#> filter: removed 3,093 rows (80%), 766 rows remaining
#> filter: removed 1,818 rows (90%), 211 rows remaining

#> filter: removed 3,084 rows (80%), 775 rows remaining
#> filter: removed 1,483 rows (73%), 546 rows remaining

#> filter: removed 2,759 rows (71%), 1,100 rows remaining
#> filter: removed 1,930 rows (95%), 99 rows remaining

#> filter: removed 3,858 rows (>99%), one row remaining
#> filter: removed 1,377 rows (68%), 652 rows remaining

#> filter: removed 2,643 rows (68%), 1,216 rows remaining
#> filter: removed 2,028 rows (>99%), one row remaining


# Now create a data frame from the list of all annual values
big_dat_oxy_q1 <- dplyr::bind_rows(oxy_data_list_q1)
big_dat_oxy_q4 <- dplyr::bind_rows(oxy_data_list_q4)
big_dat_oxy <- bind_rows(mutate(big_dat_oxy_q1, quarter = 1),
                         mutate(big_dat_oxy_q4, quarter = 4))
#> mutate: new variable 'quarter' (double) with one unique value and 0% NA
#> mutate: new variable 'quarter' (double) with one unique value and 0% NA

# Create an ID for matching the temperature data with the data
big_dat_oxy$id_env <- paste(big_dat_oxy$year, big_dat_oxy$quarter, big_dat_oxy$lon, big_dat_oxy$lat, sep = "_")

big_dat_oxy <- big_dat_oxy %>% distinct(id_env, .keep_all = TRUE)
#> distinct: removed 5,689 rows (97%), 199 rows remaining

Temperature

# Open the netCDF file
ncin <- nc_open("data/NEMO_Nordic_SCOBI/dataset-reanalysis-nemo-monthlymeans_1664183191233.nc")
                                        
print(ncin)
#> File data/NEMO_Nordic_SCOBI/dataset-reanalysis-nemo-monthlymeans_1664183191233.nc (NC_FORMAT_CLASSIC):
#> 
#>      1 variables (excluding dimension variables):
#>         float bottomT[longitude,latitude,time]   
#>             standard_name: sea_water_potential_temperature_at_sea_floor
#>             units: degrees_C
#>             long_name: Sea floor potential temperature
#>             missing_value: NaN
#>             _FillValue: NaN
#>             _ChunkSizes: 1
#>              _ChunkSizes: 523
#>              _ChunkSizes: 383
#> 
#>      3 dimensions:
#>         time  Size:336
#>             axis: T
#>             long_name: Validity time
#>             standard_name: time
#>             units: days since 1950-01-01 00:00:00
#>             calendar: gregorian
#>             _ChunkSizes: 512
#>             _CoordinateAxisType: Time
#>             valid_min: 15721.5
#>             valid_max: 25917.5
#>         latitude  Size:523
#>             axis: Y
#>             standard_name: latitude
#>             long_name: latitude
#>             units: degrees_north
#>             _CoordinateAxisType: Lat
#>             valid_min: 48.49169921875
#>             valid_max: 65.8914184570312
#>         longitude  Size:383
#>             standard_name: longitude
#>             long_name: longitude
#>             units: degrees_east
#>             axis: X
#>             _CoordinateAxisType: Lon
#>             valid_min: 9.01375484466553
#>             valid_max: 30.2357654571533
#> 
#>     24 global attributes:
#>         references: http://www.smhi.se
#>         institution: Swedish Meterological and Hydrological Institute
#>         history: See source and creation_date attributees
#>         Conventions: CF-1.5
#>         contact: servicedesk_cmems@mercator-ocean.eu
#>         comment: Provided by SMHI as a Copernicus Marine Environment Monitoring Service production unit
#>         bullentin_type: reanalysis
#>         cmems_product_id: BALTICSEA_REANALYSIS_PHY_003_011
#>         title: CMEMS V4 Reanalysis: NEMO model 3D fields (monthly means)
#>         FROM_ORIGINAL_FILE__easternmost_longitude: 30.2357654571533
#>         FROM_ORIGINAL_FILE__northernmost_latitude: 65.8914184570312
#>         FROM_ORIGINAL_FILE__westernmost_longitude: 9.01375484466553
#>         FROM_ORIGINAL_FILE__southernmost_latitude: 48.49169921875
#>         shallowest_depth: 1.50136542320251
#>         deepest_depth: 711.059204101562
#>         source: SMHI reanalysis run NORDIC-NS2_1d_20201201_20201201
#>         file_quality_index: 1
#>         creation_date: 2021-11-09 UTC
#>         bullentin_date: 20201201
#>         start_date: 2020-12-01 UTC
#>         stop_date: 2020-12-01 UTC
#>         start_time: 00:00 UTC
#>         stop_time: 00:00 UTC
#>         _CoordSysBuilder: ucar.nc2.dataset.conv.CF1Convention

# Get longitude and latitude
lon <- ncvar_get(ncin,"longitude")
nlon <- dim(lon)
head(lon)
#> [1] 9.013755 9.069310 9.124865 9.180420 9.235975 9.291530

lat <- ncvar_get(ncin,"latitude")
nlat <- dim(lat)
head(lat)
#> [1] 48.49170 48.52503 48.55836 48.59170 48.62503 48.65836

# Get time
time <- ncvar_get(ncin,"time")
time
#>   [1] 15721.5 15751.0 15780.5 15811.0 15841.5 15872.0 15902.5 15933.5 15964.0
#>  [10] 15994.5 16025.0 16055.5 16086.5 16116.0 16145.5 16176.0 16206.5 16237.0
#>  [19] 16267.5 16298.5 16329.0 16359.5 16390.0 16420.5 16451.5 16481.0 16510.5
#>  [28] 16541.0 16571.5 16602.0 16632.5 16663.5 16694.0 16724.5 16755.0 16785.5
#>  [37] 16816.5 16846.5 16876.5 16907.0 16937.5 16968.0 16998.5 17029.5 17060.0
#>  [46] 17090.5 17121.0 17151.5 17182.5 17212.0 17241.5 17272.0 17302.5 17333.0
#>  [55] 17363.5 17394.5 17425.0 17455.5 17486.0 17516.5 17547.5 17577.0 17606.5
#>  [64] 17637.0 17667.5 17698.0 17728.5 17759.5 17790.0 17820.5 17851.0 17881.5
#>  [73] 17912.5 17942.0 17971.5 18002.0 18032.5 18063.0 18093.5 18124.5 18155.0
#>  [82] 18185.5 18216.0 18246.5 18277.5 18307.5 18337.5 18368.0 18398.5 18429.0
#>  [91] 18459.5 18490.5 18521.0 18551.5 18582.0 18612.5 18643.5 18673.0 18702.5
#> [100] 18733.0 18763.5 18794.0 18824.5 18855.5 18886.0 18916.5 18947.0 18977.5
#> [109] 19008.5 19038.0 19067.5 19098.0 19128.5 19159.0 19189.5 19220.5 19251.0
#> [118] 19281.5 19312.0 19342.5 19373.5 19403.0 19432.5 19463.0 19493.5 19524.0
#> [127] 19554.5 19585.5 19616.0 19646.5 19677.0 19707.5 19738.5 19768.5 19798.5
#> [136] 19829.0 19859.5 19890.0 19920.5 19951.5 19982.0 20012.5 20043.0 20073.5
#> [145] 20104.5 20134.0 20163.5 20194.0 20224.5 20255.0 20285.5 20316.5 20347.0
#> [154] 20377.5 20408.0 20438.5 20469.5 20499.0 20528.5 20559.0 20589.5 20620.0
#> [163] 20650.5 20681.5 20712.0 20742.5 20773.0 20803.5 20834.5 20864.0 20893.5
#> [172] 20924.0 20954.5 20985.0 21015.5 21046.5 21077.0 21107.5 21138.0 21168.5
#> [181] 21199.5 21229.5 21259.5 21290.0 21320.5 21351.0 21381.5 21412.5 21443.0
#> [190] 21473.5 21504.0 21534.5 21565.5 21595.0 21624.5 21655.0 21685.5 21716.0
#> [199] 21746.5 21777.5 21808.0 21838.5 21869.0 21899.5 21930.5 21960.0 21989.5
#> [208] 22020.0 22050.5 22081.0 22111.5 22142.5 22173.0 22203.5 22234.0 22264.5
#> [217] 22295.5 22325.0 22354.5 22385.0 22415.5 22446.0 22476.5 22507.5 22538.0
#> [226] 22568.5 22599.0 22629.5 22660.5 22690.5 22720.5 22751.0 22781.5 22812.0
#> [235] 22842.5 22873.5 22904.0 22934.5 22965.0 22995.5 23026.5 23056.0 23085.5
#> [244] 23116.0 23146.5 23177.0 23207.5 23238.5 23269.0 23299.5 23330.0 23360.5
#> [253] 23391.5 23421.0 23450.5 23481.0 23511.5 23542.0 23572.5 23603.5 23634.0
#> [262] 23664.5 23695.0 23725.5 23756.5 23786.0 23815.5 23846.0 23876.5 23907.0
#> [271] 23937.5 23968.5 23999.0 24029.5 24060.0 24090.5 24121.5 24151.5 24181.5
#> [280] 24212.0 24242.5 24273.0 24303.5 24334.5 24365.0 24395.5 24426.0 24456.5
#> [289] 24487.5 24517.0 24546.5 24577.0 24607.5 24638.0 24668.5 24699.5 24730.0
#> [298] 24760.5 24791.0 24821.5 24852.5 24882.0 24911.5 24942.0 24972.5 25003.0
#> [307] 25033.5 25064.5 25095.0 25125.5 25156.0 25186.5 25217.5 25247.0 25276.5
#> [316] 25307.0 25337.5 25368.0 25398.5 25429.5 25460.0 25490.5 25521.0 25551.5
#> [325] 25582.5 25612.5 25642.5 25673.0 25703.5 25734.0 25764.5 25795.5 25826.0
#> [334] 25856.5 25887.0 25917.5

tunits <- ncatt_get(ncin,"time","units")
nt <- dim(time)
nt
#> [1] 336
tunits
#> $hasatt
#> [1] TRUE
#> 
#> $value
#> [1] "days since 1950-01-01 00:00:00"

# Get temperature
dname <- "bottomT"

temp_array <- ncvar_get(ncin,dname)
dlname <- ncatt_get(ncin,dname,"long_name")
dunits <- ncatt_get(ncin,dname,"units")
fillvalue <- ncatt_get(ncin,dname,"_FillValue")
dim(temp_array)
#> [1] 383 523 336

# Get global attributes
title <- ncatt_get(ncin,0,"title")
institution <- ncatt_get(ncin,0,"institution")
datasource <- ncatt_get(ncin,0,"source")
references <- ncatt_get(ncin,0,"references")
history <- ncatt_get(ncin,0,"history")
Conventions <- ncatt_get(ncin,0,"Conventions")

# Convert time: split the time units string into fields
tustr <- strsplit(tunits$value, " ")
tdstr <- strsplit(unlist(tustr)[3], "-")
tmonth <- as.integer(unlist(tdstr)[2])
tday <- as.integer(unlist(tdstr)[3])
tyear <- as.integer(unlist(tdstr)[1])

# Here I deviate from the guide a little bit. Save this info:
dates <- chron(time, origin = c(tmonth, tday, tyear))

# Crop the date variable
months <- as.numeric(substr(dates, 2, 3))
years <- as.numeric(substr(dates, 8, 9))
years <- ifelse(years > 90, 1900 + years, 2000 + years)

# Replace netCDF fill values with NA's
temp_array[temp_array == fillvalue$value] <- NA

# Next, we need to work with the months that correspond to the quarters that we use.
# loop through each time step, and if it is a good month save it as a raster.
# First get the index of months that correspond to Q4
months
#>   [1]  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1
#>  [26]  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2
#>  [51]  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3
#>  [76]  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4
#> [101]  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5
#> [126]  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6
#> [151]  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7
#> [176]  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8
#> [201]  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9
#> [226] 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10
#> [251] 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11
#> [276] 12  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12
#> [301]  1  2  3  4  5  6  7  8  9 10 11 12  1  2  3  4  5  6  7  8  9 10 11 12  1
#> [326]  2  3  4  5  6  7  8  9 10 11 12

index_keep_q1 <- which(months < 4)
index_keep_q4 <- which(months > 9)

temp_q1 <- temp_array[, , index_keep_q1]
temp_q4 <- temp_array[, , index_keep_q4]

months_keep_q1 <- months[index_keep_q1]
months_keep_q4 <- months[index_keep_q4]

years_keep_q1 <- years[index_keep_q1]
years_keep_q4 <- years[index_keep_q4]

# Now we have an array with data for that quarter
# We need to now calculate the average within a year.
# Get a sequence that takes every third value between 1: number of months (length)
loop_seq_q1 <- seq(1, dim(temp_q1)[3], by = 3)
loop_seq_q4 <- seq(1, dim(temp_q4)[3], by = 3)

# Create objects that will hold data
dlist_q1 <- list()
dlist_q4 <- list()

temp_1 <- c()
temp_2 <- c()
temp_3 <- c()
temp_ave_q1 <- c()

temp_10 <- c()
temp_11 <- c()
temp_12 <- c()
temp_ave_q4 <- c()

# Now average by quarter. The vector loop_seq_q1 is 1, 4, 7 etc. So first i is 1, 2, 3,
# which is the index we want. 

for(i in loop_seq_q1) {
  
  temp_1 <- temp_q1[, , (i)]
  temp_2 <- temp_q1[, , (i + 1)]
  temp_3 <- temp_q1[, , (i + 2)]
  
  temp_10 <- temp_q4[, , (i)]
  temp_11 <- temp_q4[, , (i + 1)]
  temp_12 <- temp_q4[, , (i + 2)]
  
  temp_ave_q1 <- (temp_1 + temp_2 + temp_3) / 3
  temp_ave_q4 <- (temp_10 + temp_11 + temp_12) / 3
  
  list_pos_q1 <- ((i/3) - (1/3)) + 1 # to get index 1:n(years)
  list_pos_q4 <- ((i/3) - (1/3)) + 1 # to get index 1:n(years)
  
  dlist_q1[[list_pos_q1]] <- temp_ave_q1
  dlist_q4[[list_pos_q4]] <- temp_ave_q4
  
}

# Now name the lists with the year:
names(dlist_q1) <- unique(years_keep_q1)
names(dlist_q4) <- unique(years_keep_q4)

# Now I need to make a loop where I extract the raster value for each year...
# The cpue data is called dat so far in this script

# Filter years in the cpue data frame to only have the years I have temperature for
d_sub_temp_q1 <- dat %>% filter(quarter == 1) %>% filter(year %in% names(dlist_q1)) %>% droplevels()
#> filter: removed 2,028 rows (34%), 3,857 rows remaining
#> filter: no rows removed
d_sub_temp_q4 <- dat %>% filter(quarter == 4) %>% filter(year %in% names(dlist_q4)) %>% droplevels()
#> filter: removed 3,857 rows (66%), 2,028 rows remaining
#> filter: no rows removed

# Create data holding object
temp_data_list_q1 <- list()
temp_data_list_q4 <- list()

# ... And for the temperature raster
raster_list_q1 <- list()
raster_list_q4 <- list()

# Create factor year for indexing the list in the loop
d_sub_temp_q1$year_f <- as.factor(d_sub_temp_q1$year)
d_sub_temp_q4$year_f <- as.factor(d_sub_temp_q4$year)

# We have missing years. Add fake data to not break the loop
d_sub_temp_q1_extra <- d_sub_temp_q4 %>% dplyr::select(year, lat, lon) %>% head(2) %>% mutate(year = c(2015, 2019))
#> mutate: changed one value (50%) of 'year' (0 new NA)
d_sub_temp_q1 <- bind_rows(d_sub_temp_q1, d_sub_temp_q1_extra) %>% 
  mutate(year_f = as.factor(year))
#> mutate: changed 2 values (<1%) of 'year_f' (2 fewer NA)

d_sub_temp_q4_extra <- d_sub_temp_q4 %>% dplyr::select(year, lat, lon) %>% head(1) %>% mutate(year = 2020)
#> mutate: changed one value (100%) of 'year' (0 new NA)
d_sub_temp_q4 <- bind_rows(d_sub_temp_q4, d_sub_temp_q4_extra) %>% 
  mutate(year_f = as.factor(year))
#> mutate: changed one value (<1%) of 'year_f' (1 fewer NA)

# Loop through each year and extract raster values for the data points
for(i in as.factor(c(2015:2020))) {
  
  # Set plot limits
  ymin = 54; ymax = 58; xmin = 12; xmax = 22
  
  # Subset a year
  temp_slice_q1 <- dlist_q1[[i]]
  temp_slice_q4 <- dlist_q4[[i]]
  
  # Create raster for that year (i)
  r_q1 <- raster(t(temp_slice_q1), xmn = min(lon), xmx = max(lon), ymn = min(lat), ymx = max(lat),
                 crs = CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs+ towgs84=0,0,0"))
  r_q4 <- raster(t(temp_slice_q4), xmn = min(lon), xmx = max(lon), ymn = min(lat), ymx = max(lat),
                 crs = CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs+ towgs84=0,0,0"))
  
  # Flip...
  r_q1 <- flip(r_q1, direction = 'y')
  r_q4 <- flip(r_q4, direction = 'y')
  
  plot(r_q1, main = paste(i, "Q1"))
  plot(r_q4, main = paste(i, "Q4"))
  
  # Filter the same year (i) in the cpue data and select only coordinates
  d_slice_q1 <- d_sub_temp_q1 %>% filter(year_f == i) %>% dplyr::select(lon, lat)
  d_slice_q4 <- d_sub_temp_q4 %>% filter(year_f == i) %>% dplyr::select(lon, lat)
  
  # Make into a SpatialPoints object
  data_sp_q1 <- SpatialPoints(d_slice_q1)
  data_sp_q4 <- SpatialPoints(d_slice_q4)
  
  # Extract raster value (temperature)
  rasValue_q1 <- raster::extract(r_q1, data_sp_q1)
  rasValue_q4 <- raster::extract(r_q4, data_sp_q4)
  
  # Now we want to plot the results of the raster extractions by plotting the cpue
  # data points over a raster and saving it for each year.
  # Make the SpatialPoints object into a raster again (for pl)
  df_q1 <- as.data.frame(data_sp_q1)
  df_q4 <- as.data.frame(data_sp_q4)
  
  # Add in the raster value in the df holding the coordinates for the cpue data
  d_slice_q1$temp <- rasValue_q1
  d_slice_q4$temp <- rasValue_q4
  
  # Add in which year
  d_slice_q1$year <- i
  d_slice_q4$year <- i
  
  # Create a index for the data last where we store all years (because our loop index
  # i is not continuous, we can't use it directly)
  index_q1 <- as.numeric(d_slice_q1$year)[1] - 1992
  index_q4 <- as.numeric(d_slice_q4$year)[1] - 1992
  
  # Add each years' data in the list
  temp_data_list_q1[[index_q1]] <- d_slice_q1
  temp_data_list_q4[[index_q4]] <- d_slice_q4
  
  # Save to check each year is ok! First convert the raster to points for plotting
  # (so that we can use ggplot)
  map_q1 <- rasterToPoints(r_q1)
  map_q4 <- rasterToPoints(r_q4)
  
  # Make the points a dataframe for ggplot
  df_rast_q1 <- data.frame(map_q1)
  df_rast_q4 <- data.frame(map_q4)
  
  # Rename y-variable and add year
  df_rast_q1 <- df_rast_q1 %>% rename("temp" = "layer") %>% mutate(year = i)
  df_rast_q4 <- df_rast_q4 %>% rename("temp" = "layer") %>% mutate(year = i)
  
  # Add each years' raster data frame in the list
  raster_list_q1[[index_q1]] <- df_rast_q1
  raster_list_q4[[index_q4]] <- df_rast_q4

}

#> filter: removed 3,858 rows (>99%), one row remaining
#> filter: removed 1,509 rows (74%), 520 rows remaining
#> rename: renamed one variable (temp)
#> mutate: new variable 'year' (character) with one unique value and 0% NA
#> rename: renamed one variable (temp)
#> mutate: new variable 'year' (character) with one unique value and 0% NA

#> filter: removed 3,093 rows (80%), 766 rows remaining
#> filter: removed 1,818 rows (90%), 211 rows remaining
#> rename: renamed one variable (temp)
#> mutate: new variable 'year' (character) with one unique value and 0% NA
#> rename: renamed one variable (temp)
#> mutate: new variable 'year' (character) with one unique value and 0% NA

#> filter: removed 3,084 rows (80%), 775 rows remaining
#> filter: removed 1,483 rows (73%), 546 rows remaining
#> rename: renamed one variable (temp)
#> mutate: new variable 'year' (character) with one unique value and 0% NA
#> rename: renamed one variable (temp)
#> mutate: new variable 'year' (character) with one unique value and 0% NA

#> filter: removed 2,759 rows (71%), 1,100 rows remaining
#> filter: removed 1,930 rows (95%), 99 rows remaining
#> rename: renamed one variable (temp)
#> mutate: new variable 'year' (character) with one unique value and 0% NA
#> rename: renamed one variable (temp)
#> mutate: new variable 'year' (character) with one unique value and 0% NA

#> filter: removed 3,858 rows (>99%), one row remaining
#> filter: removed 1,377 rows (68%), 652 rows remaining
#> rename: renamed one variable (temp)
#> mutate: new variable 'year' (character) with one unique value and 0% NA
#> rename: renamed one variable (temp)
#> mutate: new variable 'year' (character) with one unique value and 0% NA

#> filter: removed 2,643 rows (68%), 1,216 rows remaining
#> filter: removed 2,028 rows (>99%), one row remaining
#> rename: renamed one variable (temp)
#> mutate: new variable 'year' (character) with one unique value and 0% NA
#> rename: renamed one variable (temp)
#> mutate: new variable 'year' (character) with one unique value and 0% NA


# Now create a data frame from the list of all annual values
big_dat_temp_q1 <- dplyr::bind_rows(temp_data_list_q1)
big_dat_temp_q4 <- dplyr::bind_rows(temp_data_list_q4)
big_dat_temp <- bind_rows(mutate(big_dat_temp_q1, quarter = 1),
                          mutate(big_dat_temp_q4, quarter = 4))
#> mutate: new variable 'quarter' (double) with one unique value and 0% NA
#> mutate: new variable 'quarter' (double) with one unique value and 0% NA

# Create an ID for matching the temperature data with the cpue data
big_dat_temp$id_env <- paste(big_dat_temp$year, big_dat_temp$quarter, big_dat_temp$lon, big_dat_temp$lat, sep = "_")

big_dat_temp <- big_dat_temp %>% distinct(id_env, .keep_all = TRUE)
#> distinct: removed 5,689 rows (97%), 199 rows remaining
env_dat <- left_join(big_dat_oxy,
                     big_dat_temp %>% dplyr::select(id_env, temp),
                     by = "id_env") %>% 
  mutate(year = as.numeric(year))
#> left_join: added one column (temp)
#>            > rows only in x     0
#>            > rows only in y  (  0)
#>            > matched rows     199
#>            >                 =====
#>            > rows total       199
#> mutate: converted 'year' from character to double (0 new NA)

dat$id_env <- paste(dat$year, dat$quarter, dat$lon, dat$lat, sep = "_")

# Now join these data with the full_dat
dat <- left_join(dat, env_dat)
#> Joining, by = c("year", "quarter", "lat", "lon", "id_env")
#> left_join: added 2 columns (oxy, temp)
#>            > rows only in x       0
#>            > rows only in y  (    3)
#>            > matched rows     5,885
#>            >                 =======
#>            > rows total       5,885

# Temperature
dat %>% 
  distinct(haul_id, .keep_all = TRUE) %>% 
  ggplot(aes(y = lat, x = lon, color = temp)) +
  geom_point() +
  coord_sf(xlim = c(xmin, xmax),
           ylim = c(ymin, ymax)) +
  facet_grid(quarter ~ year) +
  theme(axis.text.x = element_text(angle = 90)) +
  NULL
#> distinct: removed 5,685 rows (97%), 200 rows remaining


# Oxygen
dat %>% 
  distinct(haul_id, .keep_all = TRUE) %>% 
  ggplot(aes(y = lat, x = lon, color = oxy)) +
  geom_point() +
  coord_sf(xlim = c(xmin, xmax),
           ylim = c(ymin, ymax)) +
  facet_wrap(~ year) +
  theme(axis.text.x = element_text(angle = 90)) +
  NULL
#> distinct: removed 5,685 rows (97%), 200 rows remaining

Save

small_cod_stomach <- dat %>% filter(pred_length_cm <= 25 & species == "Cod")
#> filter: removed 4,732 rows (80%), 1,153 rows remaining
large_cod_stomach <- dat %>% filter(pred_length_cm > 25 & species == "Cod")
#> filter: removed 3,732 rows (63%), 2,153 rows remaining
flounder_stomach <- dat %>% filter(species == "Flounder")
#> filter: removed 3,306 rows (56%), 2,579 rows remaining

write_csv(small_cod_stomach, "data/clean/small_cod_stomach.csv")
write_csv(large_cod_stomach, "data/clean/large_cod_stomach.csv")
write_csv(flounder_stomach, "data/clean/flounder_stomach.csv")